
## To replicate only the visualisation, skip to the "Visualisation" section at the end of this script

library(tidyverse)
library(readr)
library(MASS)
library(randomForest)
library(data.table)
library(forcats)

# We noticed that data.table::fread has reliability issues when reading directly 
# from compressed zip files. The function may only read partial data (we observed it reading 
# approximately half the table).
# Please extract the `corpora.csv.zip` file using the `unzip()` function 
# as indicated below before running "fread()"

# Extract the plain CSV first; the archive itself is never handed to fread().
unzip("data/raw/corpora.csv.zip", files = "corpora.csv", exdir = "data/raw/")

# Column 1 is kept as character, column 2 is skipped ("NULL"), and column 3 is
# parsed as POSIXct.
corpora <- data.table::fread(
  "data/raw/corpora.csv",
  header = TRUE,
  colClasses = c("character", "NULL", "POSIXct")
)

# Table listing topic names and the per-model pass flags used further down.
topics_selection <- read_csv("data/processed/topics_selection.csv")

# Lookup of corpus ids to usernames, used later to drop one author's posts.
blog_corpus_join_id.df <- data.table::fread(
  "data/processed/pwdprotect_blog_corpus_join_id.csv.gz"
)

## LT

# Comment-to-topic edge list for the LT model. Each edge is joined to the
# timestamp of its source document in `corpora`, and only documents dated on or
# after 2012-01-01 are kept.
#
# The cutoff has to be a POSIXct value. Comparing the POSIXct `posix` column
# with a Date makes R warn about "Incompatible methods" and fall back to a
# comparison of the underlying numbers (seconds vs. days since 1970-01-01),
# which lets practically every row through.
# NOTE(review): the cutoff is midnight UTC -- confirm this matches the time
# zone of the timestamps in corpora.csv.
edgelist_topic_comment_LT.el <-
  data.table::fread("data/processed/comment_LT_edgelist.csv") %>%
  dplyr::left_join(corpora, by = c(source = "corpus_id")) %>%
  dplyr::filter(posix >= as.POSIXct("2012-01-01", tz = "UTC")) %>%
  dplyr::select(source, target, weight)

# Keep only topics flagged `Pass LT model == 1`, then reshape to one row per
# source document with one weight column per topic.
edgelist_topic_comment_LT.wide <-
  edgelist_topic_comment_LT.el %>%
  dplyr::filter(target %in% 
                  topics_selection$`Topic name LT`[
                    topics_selection$`Pass LT model` == 1]) %>%
  dcast(source ~ target, value.var = "weight")

# Drop every document whose corpus id belongs to the username "Beppe Grillo".
edgelist_topic_comment_LT.wide <-
  edgelist_topic_comment_LT.wide %>%
  dplyr::filter(!source %in% 
                  blog_corpus_join_id.df$corpus_id[
                    blog_corpus_join_id.df$username == "Beppe Grillo"])

# Right-hand side of the model formula: the topic columns used as predictors.
LT_predictors <- 
  "berlusconi_fininvest_soldi + berlusconi_legge_lodo + cina_petrolio_pianeta + 
  de_magistris_genchi + euro_pensione_pagare + giornali_pubblicita_rete + 
  grecia_debito_euro + italia_nucleare_tav + mafia_ciancimino_palermo + 
  parlamento_legge_pulito + partiti_cittadini_politica + pd_partito_sinistra + 
  politica_cittadini_legge + politica_espandi_comprimi + 
  presidente_partito_senatore + processo_corte_tribunale + 
  reato_prescrizione_processo + rifiuti_raccolta_ambientale + 
  soldi_banche_telecom + stare_bambino_donne + stelle_movimento_lista"

# The platform label is the two-character prefix of the source id
# ("mu", "bl", "fo", "fa").
edgelist_topic_comment_LT.wide$platform <- 
  factor(substr(edgelist_topic_comment_LT.wide$source, 1, 2))

# edgelist_topic_comment_LT.wide <-
#   edgelist_topic_comment_LT.wide %>%
#   dplyr::mutate(meetup = grepl("^mu", source),
#                 forum = grepl("^fo", source),
#                 blog = grepl("^bl", source),
#                 facebook = grepl("^fa", source))

# Generic alias so the split and model code below does not depend on the
# LT-specific object name.
df <- edgelist_topic_comment_LT.wide

# Fixed seed: the draw below is reproducible.
set.seed(28100)

# Balanced training set: 2000 source ids per platform prefix. The prefixes are
# visited in this exact order so the random draws line up with the seed.
train <- unlist(lapply(
  c("mu", "bl", "fo", "fa"),
  function(prefix) sample(df$source[df$platform == prefix], size = 2000)
))

# Every source id that was not drawn for training is held out for testing.
test <- df$source[!(df$source %in% train)]

train.df <- df[df$source %in% train, ]
test.df <- df[df$source %in% test, ]

### RF

# platform.model.rf <- 
#   randomForest(as.formula(paste("platform ~", LT_predictors)), 
#                data=train.df, ntree=5000, mtry=21, importance=TRUE)
# 
# test.df$pred.platform.rf <- 
#   predict(platform.model.rf, test.df, type="response")
# 
# table(test.df$platform, test.df$pred.platform.rf)
# 
# prop.table(table(test.df$platform, test.df$pred.platform.rf), 1)


### LDA

# Linear discriminant analysis: predict the platform of a document from its LT
# topic weights. MASS is already attached at the top of the script and the call
# is namespace-qualified, so no extra require() is needed here.
platform.model.lda <-
  MASS::lda(as.formula(paste("platform ~", LT_predictors)), data = train.df)

# Predicted class for every held-out document.
test.df$pred.platform.lda <-
  predict(platform.model.lda, test.df)$class

# Confusion matrix (rows: observed platform, columns: predicted platform),
# first as counts and then as row-wise proportions.
table(test.df$platform, test.df$pred.platform.lda)

prop.table(table(test.df$platform, test.df$pred.platform.lda), 1)

# Persist the predictions under data/processed/, which is where the
# visualisation section at the end of this script reads them from.
test.df %>%
  dplyr::select(source, platform, pred.platform.lda) %>%
  readr::write_rds("data/processed/classify_platform_from_LT_topics.rds",
                   compress = "gz")

## BT

# Comment-to-topic edge list for the BT model. Each edge is joined to the
# timestamp of its source document in `corpora`, and only documents dated on or
# after 2012-01-01 are kept.
#
# The cutoff has to be a POSIXct value. Comparing the POSIXct `posix` column
# with a Date makes R warn about "Incompatible methods" and fall back to a
# comparison of the underlying numbers (seconds vs. days since 1970-01-01),
# which lets practically every row through.
# NOTE(review): the cutoff is midnight UTC -- confirm this matches the time
# zone of the timestamps in corpora.csv.
edgelist_topic_comment_BT.el <-
  data.table::fread("data/processed/comment_BT_edgelist.csv.gz") %>%
  dplyr::left_join(corpora, by = c(source = "corpus_id")) %>%
  dplyr::filter(posix >= as.POSIXct("2012-01-01", tz = "UTC")) %>%
  dplyr::select(source, target, weight)

# Keep only topics flagged `Pass BT model == 1`, then reshape to one row per
# source document with one weight column per topic.
edgelist_topic_comment_BT.wide <-
  edgelist_topic_comment_BT.el %>%
  dplyr::filter(target %in% 
                  topics_selection$`Topic name BT`[
                    topics_selection$`Pass BT model` == 1]) %>%
  dcast(source ~ target, value.var = "weight")

# Drop every document whose corpus id belongs to the username "Beppe Grillo".
edgelist_topic_comment_BT.wide <-
  edgelist_topic_comment_BT.wide %>%
  dplyr::filter(!source %in% 
                  blog_corpus_join_id.df$corpus_id[
                    blog_corpus_join_id.df$username == "Beppe Grillo"])

# Right-hand side of the model formula: the topic columns used as predictors.
BT_predictors <- 
  "berlusconi_nord_mafia + berlusconi_presidente_legge + blog_grillo_post + 
  evasione_iva_fiscale + guerra_armi_pianeta + italia_grecia_debito + 
  legge_referendum_cittadini + movimento_politica_stelle + 
  nucleare_energia_centrali + pd_pdl_pietro + politica_popolo_politici + 
  tv_rai_italia"

# The platform label is the two-character prefix of the source id
# ("mu", "bl", "fo", "fa").
edgelist_topic_comment_BT.wide$platform <- 
  factor(substr(edgelist_topic_comment_BT.wide$source, 1, 2))

# Generic alias so the split and model code below does not depend on the
# BT-specific object name.
df <- edgelist_topic_comment_BT.wide

# Fixed seed: the draw below is reproducible.
set.seed(28100)

# Balanced training set: 2000 source ids per platform prefix. The prefixes are
# visited in this exact order so the random draws line up with the seed.
train <- unlist(lapply(
  c("mu", "bl", "fo", "fa"),
  function(prefix) sample(df$source[df$platform == prefix], size = 2000)
))

# Every source id that was not drawn for training is held out for testing.
test <- df$source[!(df$source %in% train)]

train.df <- df[df$source %in% train, ]
test.df <- df[df$source %in% test, ]

### RF

# platform.model.rf <- 
#   randomForest(as.formula(paste("platform ~", BT_predictors)), 
#                data=train.df, ntree=5000, mtry=12, importance=TRUE)
# 
# test.df$pred.platform.rf <- 
#   predict(platform.model.rf, test.df, type="response")
# 
# table(test.df$platform, test.df$pred.platform.rf)
# 
# prop.table(table(test.df$platform, test.df$pred.platform.rf), 1)

### LDA

# Linear discriminant analysis: predict the platform of a document from its BT
# topic weights. MASS is already attached at the top of the script and the call
# is namespace-qualified, so no extra require() is needed here.
platform.model.lda <-
  MASS::lda(as.formula(paste("platform ~", BT_predictors)), data = train.df)

# Predicted class for every held-out document.
test.df$pred.platform.lda <-
  predict(platform.model.lda, test.df)$class

# Confusion matrix (rows: observed platform, columns: predicted platform),
# first as counts and then as row-wise proportions.
table(test.df$platform, test.df$pred.platform.lda)

prop.table(table(test.df$platform, test.df$pred.platform.lda), 1)

# NOTE(review): the BT predictions are not written to disk by this script (the
# write_rds() call that follows is commented out), so the visualisation section
# depends on an already existing
# data/processed/classify_platform_from_BT_topics.rds -- confirm this is
# intended.

# test.df %>%
#   dplyr::select(source, platform, pred.platform.lda) %>%
#   write_rds("data/processed/classify_platform_from_BT_topics.rds", 
#             compress = 'gz')

## Visualisation: to replicate the plots only, run the code below

# Stored hold-out predictions; the columns used below are `platform` (observed)
# and `pred.platform.lda` (predicted).
LT_topics <- read_rds("data/processed/classify_platform_from_LT_topics.rds")

BT_topics <- read_rds("data/processed/classify_platform_from_BT_topics.rds")

# Row-normalised confusion matrix of one model in long format.
#
# predictions: data frame with columns `platform` and `pred.platform.lda`.
# model_label: text stored in the `model` column (used as the facet title).
# Returns a data frame with columns Var1 (observed), Var2 (predicted),
# value (share of the observed class) and model.
confusion_long <- function(predictions, model_label) {
  prop.table(table(predictions$platform, predictions$pred.platform.lda), 1) %>%
    as.data.frame(responseName = "value") %>%
    dplyr::mutate(model = model_label)
}

# Build the figure once and reuse it for every output format. The
# random-forest panels (pred.platform.rf) are disabled, matching the
# commented-out random-forest models earlier in the script.
figure_3_b <-
  dplyr::bind_rows(
    confusion_long(LT_topics, "Linear Discriminant Analysis LT"),
    confusion_long(BT_topics, "Linear Discriminant Analysis BT")
  ) %>%
  # Replace the two-letter platform codes with readable labels on both axes.
  dplyr::mutate(Var1 = fct_recode(Var1,
                                  Meetup = "mu",
                                  Facebook = "fa",
                                  Blog = "bl",
                                  Forum = "fo"),
                Var2 = fct_recode(Var2,
                                  Meetup = "mu",
                                  Facebook = "fa",
                                  Blog = "bl",
                                  Forum = "fo")) %>%
  ggplot(aes(x = Var1, y = Var2, fill = value * 100)) +
  geom_tile() +
  scale_fill_distiller(palette = "Spectral") +
  geom_text(aes(label = paste0(round(value * 100, 0), "%"))) +
  theme_bw() +
  facet_wrap("model") +
  labs(x = "ground truth", y = "prediction", fill = "% accuracy")

# Saving is a pure side effect, so walk() is used instead of map().
purrr::walk(
  c("output/figures/figure_3_b.eps", "output/figures/figure_3_b.png"),
  function(path) {
    ggsave(path, plot = figure_3_b, width = 25, height = 15, units = "cm")
  }
)